IMPORT THE LIBRARIES¶
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import plotly.express as px
# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting/data libraries.
warnings.filterwarnings('ignore')
READ THE DATASET AND DISPLAY THE FIRST FEW ROWS¶
In [4]:
# Load the dataset from the CSV file in the working directory and preview the first rows.
DATA_PATH = "cybersecurity_attacks.csv"
df = pd.read_csv(DATA_PATH)
df.head()
Out[4]:
| Timestamp | Source IP Address | Destination IP Address | Source Port | Destination Port | Protocol | Packet Length | Packet Type | Traffic Type | Payload Data | ... | Action Taken | Severity Level | User Information | Device Information | Network Segment | Geo-location Data | Proxy Information | Firewall Logs | IDS/IPS Alerts | Log Source | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-05-30 06:33:58 | 103.216.15.12 | 84.9.164.252 | 31225 | 17616 | ICMP | 503 | Data | HTTP | Qui natus odio asperiores nam. Optio nobis ius... | ... | Logged | Low | Reyansh Dugal | Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ... | Segment A | Jamshedpur, Sikkim | 150.9.97.135 | Log Data | NaN | Server |
| 1 | 2020-08-26 07:08:30 | 78.199.217.198 | 66.191.137.154 | 17245 | 48166 | ICMP | 1174 | Data | HTTP | Aperiam quos modi officiis veritatis rem. Omni... | ... | Blocked | Low | Sumer Rana | Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ... | Segment B | Bilaspur, Nagaland | NaN | Log Data | NaN | Firewall |
| 2 | 2022-11-13 08:23:25 | 63.79.210.48 | 198.219.82.17 | 16811 | 53600 | UDP | 306 | Control | HTTP | Perferendis sapiente vitae soluta. Hic delectu... | ... | Ignored | Low | Himmat Karpe | Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ... | Segment C | Bokaro, Rajasthan | 114.133.48.179 | Log Data | Alert Data | Firewall |
| 3 | 2023-07-02 10:38:46 | 163.42.196.10 | 101.228.192.255 | 20018 | 32534 | UDP | 385 | Data | HTTP | Totam maxime beatae expedita explicabo porro l... | ... | Blocked | Medium | Fateh Kibe | Mozilla/5.0 (Macintosh; PPC Mac OS X 10_11_5; ... | Segment B | Jaunpur, Rajasthan | NaN | NaN | Alert Data | Firewall |
| 4 | 2023-07-16 13:11:07 | 71.166.185.76 | 189.243.174.238 | 6131 | 26646 | TCP | 1462 | Data | DNS | Odit nesciunt dolorem nisi iste iusto. Animi v... | ... | Blocked | Low | Dhanush Chad | Mozilla/5.0 (compatible; MSIE 5.0; Windows NT ... | Segment C | Anantapur, Tripura | 149.6.110.119 | NaN | Alert Data | Firewall |
5 rows × 25 columns
DATASET DESCRIPTION¶
In [6]:
# Dimensions of the dataset as (rows, columns).
df.shape
Out[6]:
(40000, 25)
In [7]:
# Column labels of the loaded frame.
df.columns
Out[7]:
Index(['Timestamp', 'Source IP Address', 'Destination IP Address',
'Source Port', 'Destination Port', 'Protocol', 'Packet Length',
'Packet Type', 'Traffic Type', 'Payload Data', 'Malware Indicators',
'Anomaly Scores', 'Alerts/Warnings', 'Attack Type', 'Attack Signature',
'Action Taken', 'Severity Level', 'User Information',
'Device Information', 'Network Segment', 'Geo-location Data',
'Proxy Information', 'Firewall Logs', 'IDS/IPS Alerts', 'Log Source'],
dtype='object')
In [8]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 40000 entries, 0 to 39999 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Timestamp 40000 non-null object 1 Source IP Address 40000 non-null object 2 Destination IP Address 40000 non-null object 3 Source Port 40000 non-null int64 4 Destination Port 40000 non-null int64 5 Protocol 40000 non-null object 6 Packet Length 40000 non-null int64 7 Packet Type 40000 non-null object 8 Traffic Type 40000 non-null object 9 Payload Data 40000 non-null object 10 Malware Indicators 20000 non-null object 11 Anomaly Scores 40000 non-null float64 12 Alerts/Warnings 19933 non-null object 13 Attack Type 40000 non-null object 14 Attack Signature 40000 non-null object 15 Action Taken 40000 non-null object 16 Severity Level 40000 non-null object 17 User Information 40000 non-null object 18 Device Information 40000 non-null object 19 Network Segment 40000 non-null object 20 Geo-location Data 40000 non-null object 21 Proxy Information 20149 non-null object 22 Firewall Logs 20039 non-null object 23 IDS/IPS Alerts 19950 non-null object 24 Log Source 40000 non-null object dtypes: float64(1), int64(3), object(21) memory usage: 7.6+ MB
In [9]:
df.isnull().sum() # number of missing values in each column
Out[9]:
Timestamp 0 Source IP Address 0 Destination IP Address 0 Source Port 0 Destination Port 0 Protocol 0 Packet Length 0 Packet Type 0 Traffic Type 0 Payload Data 0 Malware Indicators 20000 Anomaly Scores 0 Alerts/Warnings 20067 Attack Type 0 Attack Signature 0 Action Taken 0 Severity Level 0 User Information 0 Device Information 0 Network Segment 0 Geo-location Data 0 Proxy Information 19851 Firewall Logs 19961 IDS/IPS Alerts 20050 Log Source 0 dtype: int64
Malware Indicators, Alerts/Warnings, Proxy Information, Firewall Logs and IDS/IPS Alerts have null values.
In [11]:
df.dtypes # data type of each column
Out[11]:
Timestamp object Source IP Address object Destination IP Address object Source Port int64 Destination Port int64 Protocol object Packet Length int64 Packet Type object Traffic Type object Payload Data object Malware Indicators object Anomaly Scores float64 Alerts/Warnings object Attack Type object Attack Signature object Action Taken object Severity Level object User Information object Device Information object Network Segment object Geo-location Data object Proxy Information object Firewall Logs object IDS/IPS Alerts object Log Source object dtype: object
In [12]:
df.duplicated().sum() # number of rows that exactly duplicate an earlier row
Out[12]:
0
In [13]:
df.isnull().sum() # missing values per column, checked again before the replacements below
Out[13]:
Timestamp 0 Source IP Address 0 Destination IP Address 0 Source Port 0 Destination Port 0 Protocol 0 Packet Length 0 Packet Type 0 Traffic Type 0 Payload Data 0 Malware Indicators 20000 Anomaly Scores 0 Alerts/Warnings 20067 Attack Type 0 Attack Signature 0 Action Taken 0 Severity Level 0 User Information 0 Device Information 0 Network Segment 0 Geo-location Data 0 Proxy Information 19851 Firewall Logs 19961 IDS/IPS Alerts 20050 Log Source 0 dtype: int64
Replacing the Null Values¶
In [15]:
# Collapse 'Alerts/Warnings' to a yes/no flag: 'Alert Triggered' becomes 'yes' and every
# other value (including missing values) becomes 'no'.
# An existing 'yes' is kept as 'yes', so re-running this cell no longer resets the whole
# column to 'no' (the original comparison only recognised 'Alert Triggered').
df['Alerts/Warnings'] = (
    df['Alerts/Warnings']
    .isin(['Alert Triggered', 'yes'])
    .map({True: 'yes', False: 'no'})
)
In [16]:
# Rows with no malware indicator recorded are labelled 'None Detected'.
df['Malware Indicators'] = df['Malware Indicators'].fillna('None Detected')
In [17]:
# Rows with no proxy address recorded are labelled 'No Proxy'.
df['Proxy Information'] = df['Proxy Information'].fillna('No Proxy')
In [18]:
# Rows with no firewall log entry are labelled 'No Data'.
df['Firewall Logs'] = df['Firewall Logs'].fillna('No Data')
In [19]:
# Rows with no IDS/IPS alert entry are labelled 'No Data'.
df['IDS/IPS Alerts'] = df['IDS/IPS Alerts'].fillna('No Data')
In [20]:
# Confirm that no column has missing values after the replacements.
df.isnull().sum()
Out[20]:
Timestamp 0 Source IP Address 0 Destination IP Address 0 Source Port 0 Destination Port 0 Protocol 0 Packet Length 0 Packet Type 0 Traffic Type 0 Payload Data 0 Malware Indicators 0 Anomaly Scores 0 Alerts/Warnings 0 Attack Type 0 Attack Signature 0 Action Taken 0 Severity Level 0 User Information 0 Device Information 0 Network Segment 0 Geo-location Data 0 Proxy Information 0 Firewall Logs 0 IDS/IPS Alerts 0 Log Source 0 dtype: int64
In [21]:
# Same check as the previous cell, repeated; the counts are unchanged.
df.isnull().sum()
Out[21]:
Timestamp 0 Source IP Address 0 Destination IP Address 0 Source Port 0 Destination Port 0 Protocol 0 Packet Length 0 Packet Type 0 Traffic Type 0 Payload Data 0 Malware Indicators 0 Anomaly Scores 0 Alerts/Warnings 0 Attack Type 0 Attack Signature 0 Action Taken 0 Severity Level 0 User Information 0 Device Information 0 Network Segment 0 Geo-location Data 0 Proxy Information 0 Firewall Logs 0 IDS/IPS Alerts 0 Log Source 0 dtype: int64
Converting TimeStamp dataType from object to DateTime¶
In [23]:
# Inspect the Timestamp column before conversion (its dtype is object at this point).
df['Timestamp'].info()
<class 'pandas.core.series.Series'> RangeIndex: 40000 entries, 0 to 39999 Series name: Timestamp Non-Null Count Dtype -------------- ----- 40000 non-null object dtypes: object(1) memory usage: 312.6+ KB
In [24]:
# Parse the timestamp strings into datetime values so the .dt accessor can be used later.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
In [25]:
# Confirm the column is now datetime64[ns].
df['Timestamp'].info()
<class 'pandas.core.series.Series'> RangeIndex: 40000 entries, 0 to 39999 Series name: Timestamp Non-Null Count Dtype -------------- ----- 40000 non-null datetime64[ns] dtypes: datetime64[ns](1) memory usage: 312.6 KB
Adding columns for easier plotting¶
In [27]:
# Break the timestamp into calendar and clock components, one new column per component.
ts = df['Timestamp'].dt
df['Year'] = ts.year
df['Month'] = ts.month
df['Month_Name'] = ts.month_name()
df['DayofWeek'] = ts.dayofweek  # integer weekday, Monday = 0 ... Sunday = 6
df['Day'] = ts.day
df['Hour'] = ts.hour
df['Minute'] = ts.minute
df['Second'] = ts.second
In [28]:
# Display the frame (pandas truncates the rendering) to confirm the eight derived columns were appended.
df
Out[28]:
| Timestamp | Source IP Address | Destination IP Address | Source Port | Destination Port | Protocol | Packet Length | Packet Type | Traffic Type | Payload Data | ... | IDS/IPS Alerts | Log Source | Year | Month | Month_Name | DayofWeek | Day | Hour | Minute | Second | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-05-30 06:33:58 | 103.216.15.12 | 84.9.164.252 | 31225 | 17616 | ICMP | 503 | Data | HTTP | Qui natus odio asperiores nam. Optio nobis ius... | ... | No Data | Server | 2023 | 5 | May | 1 | 30 | 6 | 33 | 58 |
| 1 | 2020-08-26 07:08:30 | 78.199.217.198 | 66.191.137.154 | 17245 | 48166 | ICMP | 1174 | Data | HTTP | Aperiam quos modi officiis veritatis rem. Omni... | ... | No Data | Firewall | 2020 | 8 | August | 2 | 26 | 7 | 8 | 30 |
| 2 | 2022-11-13 08:23:25 | 63.79.210.48 | 198.219.82.17 | 16811 | 53600 | UDP | 306 | Control | HTTP | Perferendis sapiente vitae soluta. Hic delectu... | ... | Alert Data | Firewall | 2022 | 11 | November | 6 | 13 | 8 | 23 | 25 |
| 3 | 2023-07-02 10:38:46 | 163.42.196.10 | 101.228.192.255 | 20018 | 32534 | UDP | 385 | Data | HTTP | Totam maxime beatae expedita explicabo porro l... | ... | Alert Data | Firewall | 2023 | 7 | July | 6 | 2 | 10 | 38 | 46 |
| 4 | 2023-07-16 13:11:07 | 71.166.185.76 | 189.243.174.238 | 6131 | 26646 | TCP | 1462 | Data | DNS | Odit nesciunt dolorem nisi iste iusto. Animi v... | ... | Alert Data | Firewall | 2023 | 7 | July | 6 | 16 | 13 | 11 | 7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39995 | 2023-05-26 14:08:42 | 26.36.109.26 | 121.100.75.240 | 31005 | 6764 | UDP | 1428 | Control | HTTP | Quibusdam ullam consequatur consequuntur accus... | ... | Alert Data | Firewall | 2023 | 5 | May | 4 | 26 | 14 | 8 | 42 |
| 39996 | 2023-03-27 00:38:27 | 17.21.163.81 | 196.108.134.78 | 2553 | 28091 | UDP | 1184 | Control | HTTP | Quaerat neque esse. Animi expedita natus commo... | ... | No Data | Firewall | 2023 | 3 | March | 0 | 27 | 0 | 38 | 27 |
| 39997 | 2022-03-31 01:45:49 | 162.35.217.57 | 98.107.0.15 | 22505 | 25152 | UDP | 1043 | Data | DNS | Enim at aspernatur illum. Saepe numquam eligen... | ... | Alert Data | Server | 2022 | 3 | March | 3 | 31 | 1 | 45 | 49 |
| 39998 | 2023-09-22 18:32:38 | 208.72.233.205 | 173.79.112.252 | 20013 | 2703 | UDP | 483 | Data | FTP | Officiis dolorem sed harum provident earum dis... | ... | No Data | Server | 2023 | 9 | September | 4 | 22 | 18 | 32 | 38 |
| 39999 | 2023-10-10 11:59:52 | 14.102.21.108 | 109.198.45.7 | 50137 | 55575 | ICMP | 1175 | Control | HTTP | Eligendi omnis voluptate nihil voluptatibus do... | ... | Alert Data | Firewall | 2023 | 10 | October | 1 | 10 | 11 | 59 | 52 |
40000 rows × 33 columns
In [29]:
# Column labels after adding the date/time parts.
df.columns
Out[29]:
Index(['Timestamp', 'Source IP Address', 'Destination IP Address',
'Source Port', 'Destination Port', 'Protocol', 'Packet Length',
'Packet Type', 'Traffic Type', 'Payload Data', 'Malware Indicators',
'Anomaly Scores', 'Alerts/Warnings', 'Attack Type', 'Attack Signature',
'Action Taken', 'Severity Level', 'User Information',
'Device Information', 'Network Segment', 'Geo-location Data',
'Proxy Information', 'Firewall Logs', 'IDS/IPS Alerts', 'Log Source',
'Year', 'Month', 'Month_Name', 'DayofWeek', 'Day', 'Hour', 'Minute',
'Second'],
dtype='object')
In [30]:
# Number of records per month name, all years pooled.
# The result is indexed alphabetically by month name, not in calendar order.
df.groupby('Month_Name')['Attack Type'].count()
Out[30]:
Month_Name April 3421 August 3615 December 2675 February 3232 January 3378 July 3623 June 3609 March 3678 May 3595 November 2703 October 2989 September 3482 Name: Attack Type, dtype: int64
In [31]:
# Number of records per calendar year.
df.groupby('Year')['Attack Type'].count()
Out[31]:
Year 2020 10573 2021 10538 2022 10750 2023 8139 Name: Attack Type, dtype: int64
In [32]:
# Most frequent value in 'Geo-location Data'.
df['Geo-location Data'].value_counts().idxmax()
Out[32]:
'Ghaziabad, Meghalaya'
In [33]:
# Most frequent value in 'Network Segment'.
df['Network Segment'].value_counts().idxmax()
Out[33]:
'Segment C'
In [34]:
# Monthly record counts, split by attack type.
# 'Month' holds integers 1-12, so discrete=True is used: it gives each month its own
# unit-width bar centred on the month number instead of letting the automatic bin rule
# cut the range into bins that do not line up with whole months.
sns.histplot(data=df, x="Month", hue='Attack Type', discrete=True)
Out[34]:
<Axes: xlabel='Month', ylabel='Count'>
In [35]:
# Interactive version of the monthly histogram, coloured by attack type.
# The figure is bound to `fig` rather than `plt` so the matplotlib.pyplot alias
# imported at the top of the notebook is not overwritten.
fig = px.histogram(df, x='Month', color='Attack Type')
fig.show()
In [36]:
# Count records for every (month, attack type) pair, then pivot the attack types into
# columns so each month becomes one stacked bar.
month_type_counts = df.groupby(['Month', 'Attack Type']).size()
attacks_by_month = month_type_counts.unstack()
attacks_by_month.plot(kind='bar', stacked=True, figsize=(10, 6))
Out[36]:
<Axes: xlabel='Month'>
FILTERING DATA BY ATTACK TYPE¶
In [38]:
# Packet lengths for each attack type, selected with a boolean row mask.
attack_type = df['Attack Type']
malware_data = df.loc[attack_type == 'Malware', 'Packet Length']
intrusion_data = df.loc[attack_type == 'Intrusion', 'Packet Length']
ddos_data = df.loc[attack_type == 'DDoS', 'Packet Length']
SHOWING PACKET LENGTH DISTRIBUTIONS FOR EACH ATTACK TYPE USING HISTOGRAMS¶
In [40]:
# One packet-length histogram per attack type; the axis label and bin count are shared.
hist_options = dict(labels={'x': 'Packet Length'}, nbins=30)

malwareHistogram = px.histogram(x=malware_data, title='Malware Packet Length Distribution', **hist_options)
malwareHistogram.show()

intrusion_histogram = px.histogram(x=intrusion_data, title='Intrusion Packet Length Distribution', **hist_options)
intrusion_histogram.show()

ddos_histogram = px.histogram(x=ddos_data, title='DDoS Packet Length Distribution', **hist_options)
ddos_histogram.show()
INSIGHTS¶
Since there are only two types of browsers in the recorded list, let's split the string so that we create a new column containing only the browser names.
In [43]:
def identify_browser(device_info):
    """Classify a device-information string by its leading browser token.

    Returns "Mozilla" when the string starts with "Mozilla"; every other
    string is labelled "Opera".
    """
    return "Mozilla" if device_info.startswith("Mozilla") else "Opera"
# Label every row with its browser, derived from the 'Device Information' string.
df['Browser'] = df['Device Information'].apply(identify_browser)
In [44]:
# Record counts per browser, coloured by attack type.
# The chart covers every attack type and groups by browser only, so the title says that
# (the previous title mentioned only malware and devices).
# The figure is bound to `fig` so the matplotlib.pyplot alias `plt` is not overwritten.
fig = px.histogram(df, x='Browser', color='Attack Type', title='Number of Attacks by Browser and Attack Type')
fig.show()
In [45]:
def identify_os(device_info):
    """Classify a device-information (user-agent) string by platform.

    The string is searched for known platform markers and the label of the
    first marker found is returned; "Other" is returned when none match.

    Parameters
    ----------
    device_info : str
        User-agent style text, e.g. "Mozilla/5.0 (Linux; Android 4.4.2) ...".

    Returns
    -------
    str
        One of "Windows", "Mac OS X", "Android", "Linux", "iPod", "iPhone",
        "iPad" or "Other".
    """
    # Order matters:
    # - "Android" is tested before "Linux" because Android user agents commonly read
    #   "(Linux; Android ...)" and would otherwise be reported as Linux.
    # - "iPod" is tested before "iPhone" because iPod user agents also contain "iPhone OS".
    platform_markers = (
        ("Windows", "Windows"),
        ("Macintosh", "Mac OS X"),
        ("Android", "Android"),
        ("Linux", "Linux"),
        ("iPod", "iPod"),
        ("iPhone", "iPhone"),
        ("iPad", "iPad"),
    )
    for marker, label in platform_markers:
        if marker in device_info:
            return label
    return "Other"
# Label every row with its platform, derived from the 'Device Information' string.
df['OperatingSystem'] = df['Device Information'].apply(identify_os)
In [46]:
# Record counts per platform, coloured by attack type.
# The figure is bound to `fig` so the matplotlib.pyplot alias `plt` is not overwritten.
fig = px.histogram(df, x='OperatingSystem', color='Attack Type', title='Platform Distribution')
fig.show()
In [47]:
# Most frequent value in 'Log Source'.
df['Log Source'].value_counts().idxmax()
Out[47]:
'Firewall'
In [48]:
# Record counts per traffic type, coloured by severity level.
# The figure is bound to `fig` so the matplotlib.pyplot alias `plt` is not overwritten.
fig = px.histogram(df, x='Traffic Type', color='Severity Level', title='Traffic Type')
fig.show()
In [49]:
# Share of records per platform.
# The figure is bound to `fig` so the matplotlib.pyplot alias `plt` is not overwritten.
fig = px.pie(df, names='OperatingSystem', title='Platform Distribution')
fig.show()
In [50]:
# Packet length aggregated per protocol (with `y` given, plotly sums y within each bar by default).
# The title now describes this chart; the previous 'Platform Distribution' was carried over
# from the platform plots.
# The figure is bound to `fig` so the matplotlib.pyplot alias `plt` is not overwritten.
fig = px.histogram(df, x='Protocol', y='Packet Length', title='Total Packet Length by Protocol')
fig.show()
In [51]:
# Record counts per traffic type, coloured by attack type.
# The title now describes this chart; the previous 'Platform Distribution' was carried over
# from the platform plots.
# The figure is bound to `fig` so the matplotlib.pyplot alias `plt` is not overwritten.
fig = px.histogram(df, x='Traffic Type', color='Attack Type', title='Traffic Type by Attack Type')
fig.show()
In [52]:
# Record counts per action taken, coloured by severity level.
# The title now describes this chart; the previous 'Platform Distribution' was carried over
# from the platform plots.
# The figure is bound to `fig` so the matplotlib.pyplot alias `plt` is not overwritten.
fig = px.histogram(df, x='Action Taken', color='Severity Level', title='Action Taken by Severity Level')
fig.show()